# Setup

In [6]:
import torch
import numpy as np
import json
import torch.nn as nn
from birdlib import utils

In [7]:
# sudo modprobe nvidia_uvm
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
# Identifiers interpolated into the dataset, config and model paths below.
DATASET_NAME = "dataset"
MODEL_NAME = 'VanillaCNN'
DATASET_VAR = 'augm_final'

In [9]:
# Paths derived from the identifiers above (all relative to the notebook's working directory).
DATASET_PATH = f'../segments/{DATASET_NAME}'
TRAIN_PATH = f"{DATASET_PATH}/train"
TEST_PATH = f"{DATASET_PATH}/test"
MODEL_PATH = f'./models/{MODEL_NAME}'

# Load the dataset and the model

In [10]:
# Read the JSON configuration of the selected dataset variant.
with open(f"./utils/{DATASET_NAME}/dataset_config_{DATASET_VAR}.json") as f:
    dataset_config = json.load(f)

In [11]:
# One sample per batch for both splits.
# NOTE(review): valid_loader does not pass shuffle=False; its ordering depends on the
# default of utils.get_dataloader — confirm if a deterministic order is needed.
test_loader = utils.get_dataloader(dataset_config, split="test", batch_size=1, shuffle=False)
valid_loader = utils.get_dataloader(dataset_config, split="valid", batch_size=1)


In [12]:
# Class mappings from the dataset config; their count is passed to the model constructor below.
mappings = dataset_config["mappings"]

In [13]:
# NOTE(review): absolute path inside one user's home directory — this cell will fail on
# any other machine; consider a path relative to the project.
# NOTE(review): `config` is not referenced in the visible part of the notebook — confirm it is still needed.
config_file = '/home/giacomoschiavo/finetuning-BirdNET/configs/configs_1.json'
with open(config_file) as f:
    config = json.load(f)

In [16]:
# Not referenced in the visible part of the notebook; kept in case other cells rely on it.
input_shape = (256, 256)

# Instantiate the architecture selected by MODEL_NAME with one output per mapped class.
model_class = utils.load_model_class(MODEL_NAME)
model = model_class(len(mappings))
model.to(device)

# Checkpoint location built from the notebook configuration, so that changing
# MODEL_NAME / DATASET_VAR also changes which weights are loaded.
saving_path = f'{MODEL_PATH}/{DATASET_VAR}/checkpoint_{DATASET_VAR}.pth'

# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only machine.
checkpoint = torch.load(saving_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

# Custom Thresholds Creation

In [17]:
from collections import defaultdict
from sklearn.metrics import f1_score

def calculate_conf_scores(valid_loader, model, mappings):
    """Collect per-species sigmoid confidences over a validation loader.

    Args:
        valid_loader: iterable yielding ``(mel_spec, labels, file_paths)`` batches.
            ``labels`` is ignored; ``file_paths`` holds one path per sample.
        model: network returning one logit per class for each sample.
        mappings: dict whose keys are the species names, in the same order as
            the model outputs.

    Returns:
        ``defaultdict(list)`` mapping each species name to a list of
        ``(probability, is_correct)`` tuples, one per validation sample.
        ``is_correct`` is True when the species equals the name of the
        sample's parent directory.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)

    # Built once: the class order does not change while iterating the loader.
    species_names = list(mappings.keys())
    conf_scores = defaultdict(list)

    with torch.no_grad():
        for mel_spec, _, file_path in valid_loader:
            mel_spec = mel_spec.to(device)
            probs = torch.sigmoid(model(mel_spec)).cpu().numpy()

            # Score every sample of the batch, not only the first one.
            for path, sample_probs in zip(file_path, probs):
                # The ground-truth species is the name of the file's parent directory.
                correct_species = path.split("/")[-2]
                for species_name, prob in zip(species_names, sample_probs):
                    is_correct = species_name == correct_species
                    conf_scores[species_name].append((prob, is_correct))

    return conf_scores

In [18]:
def compute_best_thresholds(conf_scores, num_thresholds=200, min_thresh=0.01, max_thresh=0.95,
                            default_thresh=0.15):
    """Pick, for every species, the decision threshold with the highest F1 score.

    Args:
        conf_scores: dict mapping a species name to a list of
            ``(probability, is_correct)`` tuples.
        num_thresholds: number of evenly spaced candidate thresholds.
        min_thresh: smallest candidate threshold.
        max_thresh: largest candidate threshold.
        default_thresh: threshold assigned when no candidate reaches an F1
            above zero, or when a species has no scores at all.

    Returns:
        dict mapping each species name to its selected threshold. On ties the
        lowest threshold wins, because only a strictly better F1 replaces it.
    """
    # The candidate grid is the same for every species, so build it once.
    candidates = np.linspace(min_thresh, max_thresh, num_thresholds)
    thresholds = {}

    for species, values in conf_scores.items():
        best_thresh = default_thresh
        best_f1 = 0.0

        # A species without scores keeps the fallback instead of failing on unpacking.
        if values:
            probs, truths = zip(*values)
            probs = np.array(probs)
            truths = np.array(truths).astype(int)

            for thresh in candidates:
                preds = (probs >= thresh).astype(int)
                f1 = f1_score(truths, preds, zero_division=0)
                if f1 > best_f1:
                    best_f1 = f1
                    best_thresh = thresh

        thresholds[species] = best_thresh
        print(f"📊 {species} -> {best_thresh:.3f}, F1-score: {best_f1:.3f}")

    return thresholds



In [19]:
# Gather validation confidences, then derive one decision threshold per species from them.
conf_scores = calculate_conf_scores(valid_loader, model, dataset_config["mappings"])
best_thresholds = compute_best_thresholds(conf_scores)
# NOTE(review): compute_distribution_based_thresholds is not defined in the visible cells.
# best_thresholds = compute_distribution_based_thresholds(conf_scores)

📊 Aeroplane -> 0.053, F1-score: 0.804
📊 Muscicapa striata_Spotted Flycatcher -> 0.865, F1-score: 0.743
📊 Periparus ater_Coal Tit -> 0.950, F1-score: 0.453
📊 Regulus regulus_Goldcrest -> 0.010, F1-score: 0.429
📊 Anthus trivialis_Tree Pipit -> 0.128, F1-score: 0.201
📊 Vegetation -> 0.010, F1-score: 0.595
📊 Troglodytes troglodytes_Eurasian Wren -> 0.851, F1-score: 0.372
📊 Erithacus rubecula_European Robin -> 0.936, F1-score: 0.440
📊 None -> 0.019, F1-score: 0.346
📊 Parus major_Great Tit -> 0.936, F1-score: 0.245
📊 Certhia familiaris_Eurasian Treecreeper -> 0.917, F1-score: 0.480
📊 Phylloscopus collybita_Common Chiffchaff -> 0.860, F1-score: 0.643
📊 Coccothraustes coccothraustes_Hawfinch -> 0.015, F1-score: 0.051
📊 Wind -> 0.090, F1-score: 0.667
📊 Turdus merula_Eurasian Blackbird -> 0.780, F1-score: 0.324
📊 Loxia curvirostra_Common Crossbill -> 0.015, F1-score: 0.294
📊 Regulus ignicapilla_Common Firecrest -> 0.879, F1-score: 0.851
📊 Sylvia atricapilla_Eurasian Blackcap -> 0.941, F1-score: 

# Test Model

In [20]:
from sklearn.metrics import average_precision_score, precision_recall_fscore_support

def compute_samplewise_mAP(y_true, y_probs):
    """
    Sample-wise mAP: average precision of each row, averaged over the rows.

    Rows of ``y_true`` whose sum is zero (no positive label) are left out.
    """
    per_sample_ap = [
        average_precision_score(y_true[row], y_probs[row])
        for row in range(y_true.shape[0])
        if np.sum(y_true[row]) != 0
    ]
    return np.mean(per_sample_ap)

def compute_classwise_mAP(y_true, y_probs):
    """
    Class-wise mAP: average precision of each column, averaged over the columns.

    Columns of ``y_true`` whose sum is zero (class never present) are left out.
    """
    per_class_ap = [
        average_precision_score(y_true[:, col], y_probs[:, col])
        for col in range(y_true.shape[1])
        if np.sum(y_true[:, col]) != 0
    ]
    return np.mean(per_class_ap)

def compute_f05(y_true, y_pred):
    """Macro-averaged F-beta score with beta=0.5; undefined scores count as 0."""
    scores = precision_recall_fscore_support(
        y_true, y_pred, beta=0.5, average='macro', zero_division=0
    )
    # The result is (precision, recall, f-score, support); only the f-score is returned.
    return scores[2]

In [21]:
# Reverse lookup: mapping value -> species name.
inverse_mappings = {index: name for name, index in mappings.items()}

In [None]:
import os

def test_model(model, dataset_config, test_loader, thresholds=0.2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n🧬 Advanced testing on: {device}")
    test_pred_segments = {}

    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    class_names = list(dataset_config['mappings'].keys())
    total_loss = 0.0
    all_preds = []
    all_probs = []
    all_labels = []

    use_custom_threshold = isinstance(thresholds, dict)

    with torch.no_grad():
        for mel_spec, labels, file_path in test_loader:
            basename = os.path.splitext(file_path[0].split("/")[-1])[0]
            date, time, segm1, segm2 = basename.split("_")
            audio_name = "_".join([date, time]) + ".WAV"
            segm = "_".join([segm1, segm2])
            test_pred_segments.setdefault(audio_name, {})

            mel_spec = mel_spec.to(device)
            labels = labels.to(device)

            outputs = model(mel_spec)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(outputs)

            if use_custom_threshold:
                batch_preds = torch.zeros_like(probs)
                for i, class_name in enumerate(class_names):
                    thresh = thresholds.get(class_name, 0.5)
                    batch_preds[:, i] = (probs[:, i] > thresh).float()
            else:
                batch_preds = (probs > thresholds).float()

            correct_probs = probs * batch_preds
            # already_added = False
            if segm not in test_pred_segments:
                test_pred_segments[audio_name][segm] = {}
            # else:
            #     already_added = True
                
            conf_scores = {
                inverse_mappings[i]: correct_probs[0, i].item()
                for i in range(correct_probs.size(1))
                if correct_probs[0, i].item() != 0
            }
            test_pred_segments[audio_name][segm].update(conf_scores)

            # if not already_added:
            #     all_probs.append(probs.cpu())
            #     all_preds.append(batch_preds.cpu())
            #     all_labels.append(labels.cpu())

    avg_loss = total_loss / len(test_loader)
    # all_probs = torch.cat(all_probs).numpy()
    # all_preds = torch.cat(all_preds).numpy()
    # all_labels = torch.cat(all_labels).numpy()

    # samplewise_map = compute_samplewise_mAP(all_labels, all_probs)  # chiamata mAP
    # classwise_map = compute_classwise_mAP(all_labels, all_probs)    # chiamata cmAP
    # f05_score = compute_f05(all_labels, all_preds)

    # with open(f"models/{MODEL_NAME}/{DATASET_VAR}/metrics_output.csv", mode="w", newline="") as f:
    #     writer = csv.writer(f)
    #     writer.writerow(["Metric", "Value"])
    #     writer.writerow(["mAP (sample-wise)", samplewise_map])
    #     writer.writerow(["cmAP (class-wise)", classwise_map])
    #     writer.writerow(["F0.5 Score", f05_score])

    # return avg_loss, all_labels, all_preds, all_probs, test_pred_segments
    return avg_loss, test_pred_segments

In [23]:
# Evaluate on the test split using the per-species thresholds selected on the validation split.
avg_loss, test_pred_segments = test_model(model, dataset_config, test_loader, thresholds=best_thresholds)


🧬 Advanced testing on: cuda


In [24]:
from collections import defaultdict
import os

def get_true_segments(test_path):
    """Build the ground-truth labels of every test segment from the folder layout.

    ``test_path`` must contain one sub-directory per species, each holding files
    named ``<date>_<time>_<segm1>_<segm2>.<ext>``.

    Returns a ``defaultdict(dict)`` mapping ``"<date>_<time>.WAV"`` ->
    ``"<segm1>_<segm2>"`` -> list of the species folders containing that segment.
    """
    true_segments = defaultdict(dict)
    for species in os.listdir(test_path):
        species_dir = os.path.join(test_path, species)
        for file_name in os.listdir(species_dir):
            # Everything before the first dot is the segment identifier.
            stem = file_name.split('.')[0]
            date, clock, segm1, segm2 = stem.split('_')
            audio_name = f"{date}_{clock}.WAV"
            segm = f"{segm1}_{segm2}"
            true_segments[audio_name].setdefault(segm, []).append(species)
    return true_segments

In [25]:
# Ground truth derived from the species folders of the test split.
true_segments = get_true_segments(TEST_PATH)

In [26]:
def get_pred_proba_segments(test_pred_segments):
    """Split ``audio -> segment -> {label: score}`` into two parallel structures.

    Returns ``(pred_segments, pred_proba)``: both map audio -> segment -> list,
    holding the predicted labels and their scores respectively, in the same order.
    """
    pred_segments = {
        audio: {segm: list(scores.keys()) for segm, scores in segments.items()}
        for audio, segments in test_pred_segments.items()
    }
    pred_proba = {
        audio: {segm: list(scores.values()) for segm, scores in segments.items()}
        for audio, segments in test_pred_segments.items()
    }
    return pred_segments, pred_proba

In [37]:
# extract recognized labels
# Separate the predicted labels from their confidence scores.
pred_segments, pred_proba = get_pred_proba_segments(test_pred_segments)

In [47]:
def fill_pred_segments(true_segments, pred_segments, pred_proba):
    """Add an empty prediction for every ground-truth segment that has none.

    Only audio files already present in ``pred_segments`` are completed; audio
    files that appear only in ``true_segments`` are left out. Both
    ``pred_segments`` and ``pred_proba`` are modified in place and returned.

    Args:
        true_segments: audio -> segment -> list of true labels.
        pred_segments: audio -> segment -> list of predicted labels.
        pred_proba: audio -> segment -> list of scores, parallel to ``pred_segments``.

    Returns:
        The same ``(pred_segments, pred_proba)`` objects, after filling.
    """
    for audio, segments in true_segments.items():
        if audio not in pred_segments:
            continue
        for segm in segments:
            if segm not in pred_segments[audio]:
                # Empty lists, matching the list-valued entries of the existing segments.
                pred_segments[audio][segm] = []
                pred_proba.setdefault(audio, {})[segm] = []

    return pred_segments, pred_proba

In [48]:
# Make sure every ground-truth segment of a predicted audio file has a prediction entry.
pred_segments, pred_proba = fill_pred_segments(true_segments, pred_segments, pred_proba)

In [49]:
from sklearn.preprocessing import MultiLabelBinarizer

# The label vocabulary is the set of species folders in the test split.
test_species_list = os.listdir(TEST_PATH)
mlb = MultiLabelBinarizer()
if DATASET_VAR == "orig":
    # For the "orig" variant only folder names containing an underscore are kept as classes.
    class_names = [species for species in test_species_list if "_" in species]
    mlb.fit([class_names])
else:
    mlb.fit([test_species_list])

len(mlb.classes_)

20

In [None]:
def binarize_test_segments(mlb, true_segments, pred_segments, pred_proba):
    """Turn per-segment label lists into aligned multi-hot / score matrices.

    Rows follow the iteration order of ``pred_segments``, with the segments of
    each audio file sorted by key.

    Args:
        mlb: fitted binarizer exposing ``classes_`` and ``transform``.
        true_segments: audio -> segment -> list of true labels.
        pred_segments: audio -> segment -> list of predicted labels.
        pred_proba: audio -> segment -> list of scores, parallel to ``pred_segments``.

    Returns:
        ``(y_true, y_pred, y_pred_proba)`` as NumPy arrays with one row per
        segment and one column per class of ``mlb``. Scores of labels unknown to
        ``mlb`` are dropped; classes without a prediction get score 0.
    """
    # Class -> column lookup built once instead of searching the class list per label.
    class_index = {label: idx for idx, label in enumerate(mlb.classes_)}
    num_classes = len(mlb.classes_)

    y_true = []
    y_pred = []
    y_pred_proba = []

    for audio, audio_pred in pred_segments.items():
        # .get() keeps this working for plain dicts that lack the audio file.
        audio_true = true_segments.get(audio, {})
        audio_proba = pred_proba.get(audio, {})

        for segment in sorted(audio_pred.keys()):
            true_labels = audio_true.get(segment, [])
            pred_labels = audio_pred[segment]
            proba_values = audio_proba.get(segment, [])

            y_true_vec = mlb.transform([true_labels])[0]  # 1D array
            y_pred_vec = mlb.transform([pred_labels])[0]  # 1D array

            proba_vec = np.zeros(num_classes)
            for label, score in zip(pred_labels, proba_values):
                idx = class_index.get(label)
                if idx is not None:
                    proba_vec[idx] = score

            y_true.append(y_true_vec)
            y_pred.append(y_pred_vec)
            y_pred_proba.append(proba_vec)

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred_proba = np.array(y_pred_proba)

    return y_true, y_pred, y_pred_proba


In [None]:
# Multi-hot ground truth, multi-hot predictions and per-class scores, one row per segment.
y_true, y_pred, y_pred_proba = binarize_test_segments(mlb, true_segments, pred_segments, pred_proba)

In [52]:
from sklearn.metrics import classification_report
import pandas as pd
# Per-class precision / recall / F1 as a dict; undefined scores are reported as 0.
report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0, output_dict=True)

# Transposed so that each class (and each average) is one row.
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
Aeroplane,0.1,0.045455,0.0625,22.0
Anthus trivialis_Tree Pipit,0.405882,0.413174,0.409496,167.0
Certhia familiaris_Eurasian Treecreeper,0.6,0.531646,0.563758,79.0
Coccothraustes coccothraustes_Hawfinch,0.0,0.0,0.0,87.0
Erithacus rubecula_European Robin,0.219512,0.080935,0.118265,556.0
Fringilla coelebs_Common Chaffinch,0.401274,0.540386,0.460554,1399.0
Lophophanes cristatus_Crested Tit,0.0,0.0,0.0,23.0
Loxia curvirostra_Common Crossbill,0.152542,0.439024,0.226415,41.0
Muscicapa striata_Spotted Flycatcher,0.0,0.0,0.0,173.0
,0.794983,0.66325,0.723166,4683.0


In [None]:
# Aggregate rows of the classification report.
report["micro avg"], report["weighted avg"], report["samples avg"]

({'precision': 0.4396110989731265,
  'recall': 0.506563615177336,
  'f1-score': 0.45398004515330276,
  'support': 10137},
 {'precision': 0.48932401849847484,
  'recall': 0.49057906678504487,
  'f1-score': 0.48995073891625607,
  'support': 10137},
 {'precision': 0.5831679401577161,
  'recall': 0.49057906678504487,
  'f1-score': 0.5134722025271634,
  'support': 10137})

In [53]:
# Release cached GPU memory that PyTorch is no longer using.
torch.cuda.empty_cache()

In [54]:
# Persist the thresholded per-segment predictions next to the model for this dataset variant.
os.makedirs(f"{MODEL_PATH}/{DATASET_VAR}", exist_ok=True)
with open(f"{MODEL_PATH}/{DATASET_VAR}/test_pred_segments.json", "w") as f:
    json.dump(test_pred_segments, f)

In [55]:
# Save the binarized arrays together with the class order used for their columns.
np.savez(f'{MODEL_PATH}/{DATASET_VAR}/results.npz', y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, class_names=mlb.classes_)
