# Setup

In [1]:
import torch
import numpy as np
import json
import torch.nn as nn
from birdlib import utils

In [2]:
# Shell command kept for reference (presumably needed when CUDA is not detected — TODO confirm):
#   sudo modprobe nvidia_uvm
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [24]:
# Folder name of the dataset (used under ../segments and ./utils).
DATASET_NAME = 'dataset'
# Architecture name handed to utils.load_model_class; also the sub-folder of ./models.
MODEL_NAME = "DeeperCNN"
# Dataset variant: selects the dataset_config_*.json file and the checkpoint_*.pth file.
DATASET_VAR = "wabad"

In [25]:
# Where the artefacts of the selected model live.
MODEL_PATH = f"./models/{MODEL_NAME}"

# Root of the segmented dataset and its train/test splits.
DATASET_PATH = f"../segments/{DATASET_NAME}"
TRAIN_PATH = f'{DATASET_PATH}/train'
TEST_PATH = f'{DATASET_PATH}/test'

# Load the model

In [26]:
# Load the configuration of the selected dataset variant.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open(f"./utils/{DATASET_NAME}/dataset_config_{DATASET_VAR}.json", encoding="utf-8") as f:
    dataset_config = json.load(f)

In [27]:
# Class mappings from the dataset config; later cells use it as
# class name -> output index (e.g. mappings["None"] indexes a prediction column).
mappings = dataset_config["mappings"]

In [28]:
# Build the architecture, passing the number of mapped classes, and move it to `device`.
model_class = utils.load_model_class(MODEL_NAME)
model = model_class(len(mappings))
model.to(device)

# Restore the trained weights. `map_location` remaps tensors that were saved on a
# different device (e.g. a CUDA checkpoint opened on a CPU-only machine) onto `device`,
# which keeps the CPU fallback selected above usable.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
saving_path = f'models/{MODEL_NAME}/checkpoint_{DATASET_VAR}.pth'
checkpoint = torch.load(saving_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

# Custom Thresholds Creation

In [29]:
from collections import defaultdict
from sklearn.metrics import f1_score

def calculate_conf_scores(valid_loader, model, mappings):
    """Collect per-class confidence scores over a validation loader.

    For every sample the model logits go through a sigmoid and, for each class,
    the pair ``(probability, is_correct)`` is recorded. ``is_correct`` tells
    whether that class is the sample's true species, which is read from the name
    of the directory containing the sample file; the label tensor yielded by the
    loader is ignored.

    Args:
        valid_loader: Iterable yielding ``(mel_spec, labels, file_paths)`` batches,
            where ``file_paths`` holds one "/"-separated path per sample.
        model: ``torch.nn.Module`` returning one logit per class for each sample.
        mappings: Dict keyed by class name. The i-th key is paired with the i-th
            model output (assumes key order matches the class index — TODO confirm).

    Returns:
        ``defaultdict(list)`` mapping each class name to a list of
        ``(probability, is_correct)`` tuples, one per evaluated sample.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)

    # Hoisted out of the loops: the class order is the same for every sample.
    species_names = list(mappings.keys())
    conf_scores = defaultdict(list)

    with torch.no_grad():
        for mel_spec, _, file_path in valid_loader:
            mel_spec = mel_spec.to(device)
            batch_probs = torch.sigmoid(model(mel_spec)).cpu().numpy()

            # Score every sample of the batch (not only the first one), so the
            # function is also correct when the loader uses batch_size > 1.
            for sample_probs, sample_path in zip(batch_probs, file_path):
                # The parent directory of the file names the true species.
                correct_species = sample_path.split("/")[-2]

                for i, prob in enumerate(sample_probs):
                    species_name = species_names[i]
                    is_correct = species_name == correct_species
                    conf_scores[species_name].append((prob, is_correct))

    return conf_scores

In [30]:
def compute_best_thresholds(conf_scores, num_thresholds=100, min_thresh=0.05, max_thresh=0.95):
    """Pick, for every class, the decision threshold with the highest F1-score.

    Candidates are ``num_thresholds`` evenly spaced values from ``min_thresh`` to
    ``max_thresh`` (both included); a sample counts as positive when its
    probability is ``>=`` the candidate. On ties the lowest candidate wins, and
    when no candidate reaches an F1 above zero the threshold stays at 0.1.

    Args:
        conf_scores: Dict mapping class name to a list of
            ``(probability, is_correct)`` pairs.
        num_thresholds: Number of candidate thresholds to try.
        min_thresh: Lowest candidate threshold.
        max_thresh: Highest candidate threshold.

    Returns:
        Dict mapping class name to the selected threshold. One summary line per
        class is printed as a side effect.
    """
    thresholds = {}

    for species, scored_samples in conf_scores.items():
        # Split the (probability, is_correct) pairs into two parallel arrays.
        confidences, is_positive = (np.array(column) for column in zip(*scored_samples))
        targets = is_positive.astype(int)

        best_f1, best_thresh = 0.0, 0.1
        for candidate in np.linspace(min_thresh, max_thresh, num_thresholds):
            predicted = (confidences >= candidate).astype(int)
            candidate_f1 = f1_score(targets, predicted, zero_division=0)
            # Strict comparison: an equal score never replaces an earlier candidate.
            if candidate_f1 > best_f1:
                best_f1, best_thresh = candidate_f1, candidate

        thresholds[species] = best_thresh
        print(f"📊 {species} -> {best_thresh:.3f}, F1-score: {best_f1:.3f}")

    return thresholds



In [31]:
def compute_distribution_based_thresholds(conf_scores, min_thresh=0.05, max_thresh=0.95, fallback_thresh=0.2):
    """Derive per-class thresholds from the confidence distribution of true positives.

    For each class the threshold is ``mean - 2 * std`` of the probabilities given
    to the samples that really belong to that class, clipped to
    ``[min_thresh, max_thresh]``. A class without any positive sample receives
    ``fallback_thresh``.

    Args:
        conf_scores: Dict mapping class name to a list of
            ``(probability, is_correct)`` pairs.
        min_thresh: Lower bound applied to the computed threshold.
        max_thresh: Upper bound applied to the computed threshold.
        fallback_thresh: Threshold used for classes with no positive sample.

    Returns:
        Dict mapping class name to its threshold. One summary line per class is
        printed as a side effect.
    """
    thresholds = {}

    for species, values in conf_scores.items():
        probs, truths = zip(*values)
        probs = np.array(probs)
        truths = np.array(truths).astype(int)

        # Confidences of the samples whose true class is this species.
        correct_probs = probs[truths == 1]

        if len(correct_probs) == 0:
            thresholds[species] = fallback_thresh
            # The message reports the value that is actually applied.
            print(f"⚠️ Specie: {species}, nessuna predizione corretta trovata. Soglia fallback: {fallback_thresh}")
            continue

        # Statistics are taken over the positive samples only: these are the
        # samples counted in the summary line, and the reason a class without
        # positives needs a fallback.
        mean_conf = np.mean(correct_probs)
        std_conf = np.std(correct_probs)
        threshold = np.clip(mean_conf - 2 * std_conf, min_thresh, max_thresh)

        thresholds[species] = threshold
        print(f"📊 {species} -> {threshold:.3f}, (mean={mean_conf:.3f}, std={std_conf:.3f}), samples: {len(correct_probs)}")

    return thresholds


In [32]:
# Score the validation split, then tune one decision threshold per class on it.
valid_loader = utils.get_dataloader(dataset_config, split="valid", batch_size=1)
conf_scores = calculate_conf_scores(
    valid_loader=valid_loader,
    model=model,
    mappings=dataset_config["mappings"],
)
best_thresholds = compute_best_thresholds(conf_scores=conf_scores)
# Alternative strategy (disabled):
# best_thresholds = compute_distribution_based_thresholds(conf_scores)

📊 Aeroplane -> 0.914, F1-score: 0.872
📊 Muscicapa striata_Spotted Flycatcher -> 0.186, F1-score: 0.780
📊 Periparus ater_Coal Tit -> 0.950, F1-score: 0.369
📊 Cuculus canorus_Common Cuckoo -> 0.950, F1-score: 0.500
📊 Regulus regulus_Goldcrest -> 0.050, F1-score: 0.276
📊 Anthus trivialis_Tree Pipit -> 0.923, F1-score: 0.297
📊 Vegetation -> 0.623, F1-score: 0.207
📊 Troglodytes troglodytes_Eurasian Wren -> 0.386, F1-score: 0.443
📊 Erithacus rubecula_European Robin -> 0.532, F1-score: 0.314
📊 None -> 0.059, F1-score: 0.007
📊 Parus major_Great Tit -> 0.795, F1-score: 0.386
📊 Certhia familiaris_Eurasian Treecreeper -> 0.950, F1-score: 0.243
📊 Phylloscopus collybita_Common Chiffchaff -> 0.877, F1-score: 0.637
📊 Coccothraustes coccothraustes_Hawfinch -> 0.223, F1-score: 0.154
📊 Wind -> 0.077, F1-score: 0.474
📊 Turdus merula_Eurasian Blackbird -> 0.950, F1-score: 0.335
📊 Loxia curvirostra_Common Crossbill -> 0.114, F1-score: 0.215
📊 Regulus ignicapilla_Common Firecrest -> 0.132, F1-score: 0.830
📊

# Test Model

In [33]:
from sklearn.metrics import average_precision_score, precision_recall_fscore_support

def compute_samplewise_mAP(y_true, y_probs):
    """
    Compute the sample-wise mAP (mean of the average precision of each sample).

    Samples (rows) without any positive label are left out of the mean.
    """
    rows_with_positives = [
        row for row in range(y_true.shape[0]) if np.sum(y_true[row]) != 0
    ]
    ap_values = [
        average_precision_score(y_true[row], y_probs[row])
        for row in rows_with_positives
    ]
    return np.mean(ap_values)

def compute_classwise_mAP(y_true, y_probs):
    """
    Compute the class-wise mAP (mean of the average precision of each class).

    Classes (columns) that never occur in ``y_true`` are left out of the mean.
    """
    columns_with_positives = [
        col for col in range(y_true.shape[1]) if np.sum(y_true[:, col]) != 0
    ]
    ap_values = [
        average_precision_score(y_true[:, col], y_probs[:, col])
        for col in columns_with_positives
    ]
    return np.mean(ap_values)

def compute_f05(y_true, y_pred):
    """Return the macro-averaged F-beta score with beta=0.5 for binarised predictions."""
    # precision_recall_fscore_support returns (precision, recall, fscore, support).
    prf_support = precision_recall_fscore_support(
        y_true,
        y_pred,
        beta=0.5,
        average='macro',
        zero_division=0,
    )
    return prf_support[2]

In [34]:
# Reverse lookup: class index -> class name.
inverse_mappings = {
    class_index: class_name for class_name, class_index in mappings.items()
}

In [35]:
import numpy as np
import csv
import os

def test_model(model, dataset_config, batch_size=1, thresholds=0.5):
    """Evaluate ``model`` on the test split and collect per-segment predictions.

    Logits become probabilities through a sigmoid and are binarised either with
    one global threshold or with per-class thresholds. Sample-wise mAP,
    class-wise mAP and macro F0.5 are computed and also written to
    ``models/{MODEL_NAME}/metrics_output.csv`` (``MODEL_NAME`` is the
    notebook-level constant).

    Args:
        model: Network returning one logit per class for each sample.
        dataset_config: Dataset configuration; its ``'mappings'`` entry maps class
            name to output index.
        batch_size: Batch size of the test loader.
        thresholds: Either a float applied to every class, or a dict
            ``{class_name: threshold}``; classes missing from the dict use 0.5.

    Returns:
        Tuple ``(avg_loss, all_labels, all_preds, samplewise_map, classwise_map,
        f05_score, test_pred_segments)``. ``test_pred_segments`` has the shape
        ``{audio_name: {segment: {class_name: probability}}}`` and only holds
        classes that passed their threshold with a non-zero probability.

    Raises:
        ValueError: If the test loader yields no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n🧬 Advanced testing on: {device}")
    test_pred_segments = {}

    test_loader = utils.get_dataloader(dataset_config, split="test", batch_size=batch_size, shuffle=False)
    if len(test_loader) == 0:
        # Fail with a clear message instead of a ZeroDivisionError further down.
        raise ValueError("The test loader is empty: nothing to evaluate.")

    # Inputs are moved to `device` below, so the model has to live there as well.
    model.to(device)
    model.eval()
    criterion = nn.BCEWithLogitsLoss()

    class_mappings = dataset_config['mappings']
    class_names = list(class_mappings.keys())
    # Built from the argument instead of reading the notebook-level
    # `inverse_mappings`, so the function depends only on what it is given.
    index_to_name = {index: name for name, index in class_mappings.items()}

    total_loss = 0.0
    all_preds = []
    all_probs = []
    all_labels = []

    use_custom_threshold = isinstance(thresholds, dict)
    if use_custom_threshold:
        # Loop-invariant: one threshold per output column, in class_names order
        # (assumes key order matches the output index — TODO confirm).
        # NOTE(review): compute_best_thresholds selects thresholds with `>=`,
        # while a strict `>` is applied here; they differ only on exact equality.
        class_thresholds = torch.tensor(
            [float(thresholds.get(class_name, 0.5)) for class_name in class_names],
            dtype=torch.float32,
            device=device,
        )

    with torch.no_grad():
        for mel_spec, labels, file_path in test_loader:
            mel_spec = mel_spec.to(device)
            labels = labels.to(device)

            outputs = model(mel_spec)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(outputs)

            if use_custom_threshold:
                batch_preds = (probs > class_thresholds.to(probs.dtype)).to(probs.dtype)
            else:
                batch_preds = (probs > thresholds).float()

            # Probabilities of the predicted classes; every other entry becomes 0.
            # A single transfer to Python lists replaces the per-element .item() calls.
            kept_probs = (probs * batch_preds).cpu().tolist()

            # Record every sample of the batch (not only the first one), so that
            # batch_size > 1 still yields complete per-segment predictions.
            for sample_path, sample_probs in zip(file_path, kept_probs):
                # File names are expected to be <date>_<time>_<segm1>_<segm2>.<ext>.
                basename = os.path.splitext(os.path.basename(sample_path))[0]
                date_part, time_part, segm1, segm2 = basename.split("_")
                audio_name = "_".join([date_part, time_part]) + ".WAV"
                segm = "_".join([segm1, segm2])

                conf_scores = {
                    index_to_name[i]: prob
                    for i, prob in enumerate(sample_probs)
                    if prob != 0
                }
                test_pred_segments.setdefault(audio_name, {}).setdefault(segm, {}).update(conf_scores)

            all_probs.append(probs.cpu())
            all_preds.append(batch_preds.cpu())
            all_labels.append(labels.cpu())

    avg_loss = total_loss / len(test_loader)
    all_probs = torch.cat(all_probs).numpy()
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    samplewise_map = compute_samplewise_mAP(all_labels, all_probs)  # sample-wise mAP
    classwise_map = compute_classwise_mAP(all_labels, all_probs)    # class-wise mAP
    f05_score = compute_f05(all_labels, all_preds)

    # Make sure the target folder exists before writing the metrics file.
    os.makedirs(f"models/{MODEL_NAME}", exist_ok=True)
    with open(f"models/{MODEL_NAME}/metrics_output.csv", mode="w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Metric", "Value"])
        writer.writerow(["mAP (sample-wise)", samplewise_map])
        writer.writerow(["cmAP (class-wise)", classwise_map])
        writer.writerow(["F0.5 Score", f05_score])

    return avg_loss, all_labels, all_preds, samplewise_map, classwise_map, f05_score, test_pred_segments

In [36]:
# Run the evaluation on the test split with the per-class thresholds tuned above.
(
    avg_loss,
    all_labels,
    all_preds,
    samplewise_map,
    classwise_map,
    f05_score,
    test_pred_segments,
) = test_model(model=model, dataset_config=dataset_config, thresholds=best_thresholds)


🧬 Advanced testing on: cuda


In [37]:
# "None" is exclusive: wherever it is predicted, drop every other class of that sample.
none_index = mappings["None"]
# Rows (samples) whose "None" column is set.
mask = all_preds[:, none_index] == 1
# Overwrite those rows in place with a one-hot vector that keeps only "None".
all_preds[mask] = np.eye(all_preds.shape[1], dtype=all_preds.dtype)[none_index]
all_preds

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [38]:
# Ask PyTorch to empty its CUDA cache now that the test pass is finished.
torch.cuda.empty_cache()

In [39]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the current test predictions, as a nested dict.
# Class names are taken in the key order of the mappings.
class_names = list(dataset_config['mappings'])
clf_report = classification_report(
    all_labels,
    all_preds,
    target_names=class_names,
    output_dict=True,
    zero_division=0,
)

In [40]:
# Persist the per-segment predictions under the model's dataset-variant folder.
os.makedirs(f"{MODEL_PATH}/{DATASET_VAR}", exist_ok=True)
with open(f"{MODEL_PATH}/{DATASET_VAR}/test_pred_segments.json", mode="w") as segments_file:
    json.dump(test_pred_segments, fp=segments_file)

In [41]:
# Headline metrics of the test run, one per line.
for _metric_label, _metric_value in (
    ("mAP Score: ", samplewise_map),
    ("mcAP Score: ", classwise_map),
    ("F0.5 Score: ", f05_score),
):
    print(_metric_label, _metric_value)

mAP Score:  0.7034713422880997
mcAP Score:  0.3456516536938584
F0.5 Score:  0.3782775757341676


In [42]:
from io import StringIO
import pandas as pd

# Round-trip the report through JSON so pandas builds one row per report entry.
report_json = json.dumps(clf_report)
clf_report_df = pd.read_json(StringIO(report_json), orient='index')
clf_report_df

Unnamed: 0,precision,recall,f1-score,support
Aeroplane,0.0,0.0,0.0,22
Muscicapa striata_Spotted Flycatcher,0.0,0.0,0.0,173
Periparus ater_Coal Tit,0.595238,0.230415,0.332226,217
Cuculus canorus_Common Cuckoo,0.0,0.0,0.0,3
Regulus regulus_Goldcrest,0.58209,0.286765,0.384236,136
Anthus trivialis_Tree Pipit,0.905109,0.649215,0.756098,191
Vegetation,0.169231,0.203704,0.184874,54
Troglodytes troglodytes_Eurasian Wren,0.5,0.308057,0.381232,211
Erithacus rubecula_European Robin,0.502058,0.176812,0.261522,690
,0.764162,0.846893,0.803403,4683


In [43]:
# Store the classification report next to the model, tagged with the dataset variant.
with open(f"{MODEL_PATH}/classification_report_{DATASET_VAR}.json", mode="w") as report_file:
    json.dump(clf_report, fp=report_file)

In [44]:
# Ask PyTorch to empty its CUDA cache at the end of the notebook.
torch.cuda.empty_cache()