# Final Test Showdown
Testing on segments files in Bird_tags_Test.mat

In [67]:
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import numpy as np
import json
import torch.nn as nn
import os
import utils

In [68]:
# sudo modprobe nvidia_uvm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [69]:
DATASET_NAME = "DATASET_CNN"
MODEL_NAME = 'DeeperCNN'

In [70]:
DATASET_PATH = f'../segments/{DATASET_NAME}'
FINAL_TEST_PATH = f"{DATASET_PATH}/final_test"
TEST_PATH = f"{DATASET_PATH}/test"
TRAIN_PATH = f"{DATASET_PATH}/train"
MODEL_PATH = f'./models/{MODEL_NAME}'

# Load the model

In [71]:
with open(f"utils/{DATASET_NAME}/dataset_config.json") as f:
    dataset_config = json.load(f)

In [None]:
def create_dataset_config(dataset_name):
    # per ora dataset_config_1 contiene anche final_test
    # tengo momentaneamente cosi poi sarebbe da cambiare
    saving_path = f"utils/{dataset_name}/dataset_config.json"
    if os.path.exists(saving_path):
        print("Dataset config already created!")
        with open(saving_path) as f:
            return json.load(f)

    mappings = utils.get_mappings(TEST_PATH)
    samples = utils.collect_samples(TRAIN_PATH, TEST_PATH, FINAL_TEST_PATH, mappings)

    dataset_config = {
        "mappings": mappings,
        "samples": samples
    }
    with open(saving_path, "w") as f:
        json.dump(dataset_config, f)
    print("Saved new dataset config")
    return dataset_config

In [90]:
dataset_config = create_dataset_config(DATASET_NAME)

Dataset config already created!


In [91]:
mappings = dataset_config["mappings"]

In [92]:
model_class = utils.load_model_class(MODEL_NAME)
model = model_class(len(mappings))
model.to(device)
saving_path = f'models/{MODEL_NAME}/checkpoint.pth'
checkpoint = torch.load(saving_path)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

# Specs generation

In [116]:
SPECS_FINAL_TEST_PATH = f"{DATASET_PATH}/final_test_specs"
os.makedirs(SPECS_FINAL_TEST_PATH, exist_ok=True)
utils.specs_generation(FINAL_TEST_PATH, SPECS_FINAL_TEST_PATH, dataset_config['mappings'])

Processing: Muscicapa striata_Spotted Flycatcher
Processing: Periparus ater_Coal Tit
Processing: Regulus regulus_Goldcrest
Processing: Troglodytes troglodytes_Eurasian Wren
Processing: Erithacus rubecula_European Robin
Processing: Certhia familiaris_Eurasian Treecreeper
Processing: Turdus merula_Eurasian Blackbird
Processing: Loxia curvirostra_Common Crossbill
Processing: Regulus ignicapilla_Common Firecrest
Processing: Sylvia atricapilla_Eurasian Blackcap
Processing: Lophophanes cristatus_Crested Tit
Processing: Fringilla coelebs_Common Chaffinch


# Custom Thresholds Creation

In [117]:
from collections import defaultdict
from sklearn.metrics import f1_score

def calculate_conf_scores(valid_loader, model, mappings):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)
    
    conf_scores = defaultdict(list)

    with torch.no_grad():
        for mel_spec, _, file_path in valid_loader:
            mel_spec = mel_spec.to(device)

            # Estraggo la specie corretta dal path
            correct_species = file_path[0].split("/")[-2]
            outputs = model(mel_spec)
            probs = torch.sigmoid(outputs)[0].cpu().numpy()

            for i, prob in enumerate(probs):
                species_name = list(mappings.keys())[i]
                is_correct = species_name == correct_species
                conf_scores[species_name].append((prob, is_correct))

    return conf_scores

In [118]:
def compute_best_thresholds(conf_scores, num_thresholds=100, min_thresh=0.05, max_thresh=0.95):
    thresholds = {}

    for species, values in conf_scores.items():
        probs, truths = zip(*values)
        probs = np.array(probs)
        truths = np.array(truths).astype(int)

        best_thresh = 0.5
        best_f1 = 0.0

        for thresh in np.linspace(min_thresh, max_thresh, num_thresholds):
            preds = (probs >= thresh).astype(int)
            f1 = f1_score(truths, preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        thresholds[species] = best_thresh
        print(f"📊 {species} -> {best_thresh:.3f}, F1-score: {best_f1:.3f}")

    return thresholds



In [119]:
def compute_distribution_based_thresholds(conf_scores, min_thresh=0.05, max_thresh=0.95):
    thresholds = {}

    for species, values in conf_scores.items():
        probs, truths = zip(*values)
        probs = np.array(probs)
        truths = np.array(truths).astype(int)

        correct_probs = np.array([p for p, t in zip(probs, truths) if t == 1])

        if len(correct_probs) == 0:
            thresholds[species] = 0.5  # fallback
            print(f"⚠️ Specie: {species}, nessuna predizione corretta trovata. Soglia fallback: 0.5")
            continue

        mean_conf = np.mean(probs)
        std_conf = np.std(probs)
        threshold = np.clip(mean_conf - 2 * std_conf, min_thresh, max_thresh)

        thresholds[species] = threshold
        print(f"📊 {species} -> {threshold:.3f}, (mean={mean_conf:.3f}, std={std_conf}), samples: {len(correct_probs)}")

    return thresholds


In [120]:
valid_loader = utils.get_dataloader(dataset_config, split="valid", batch_size=1)
conf_scores = calculate_conf_scores(valid_loader, model, dataset_config["mappings"])
best_thresholds = compute_best_thresholds(conf_scores)
# best_thresholds = compute_distribution_based_thresholds(conf_scores)

📊 Regulus ignicapilla_Common Firecrest -> 0.395, F1-score: 0.629
📊 Sylvia atricapilla_Eurasian Blackcap -> 0.768, F1-score: 0.780
📊 Fringilla coelebs_Common Chaffinch -> 0.623, F1-score: 0.588
📊 Troglodytes troglodytes_Eurasian Wren -> 0.232, F1-score: 0.664
📊 Muscicapa striata_Spotted Flycatcher -> 0.350, F1-score: 0.840
📊 Glaucidium passerinum_Eurasian Pygmy-Owl -> 0.377, F1-score: 0.800
📊 Pyrrhula pyrrhula_Eurasian Bullfinch -> 0.659, F1-score: 0.400
📊 Periparus ater_Coal Tit -> 0.386, F1-score: 0.702
📊 Lophophanes cristatus_Crested Tit -> 0.105, F1-score: 0.608
📊 Regulus regulus_Goldcrest -> 0.895, F1-score: 0.854
📊 Turdus merula_Eurasian Blackbird -> 0.732, F1-score: 0.746
📊 Certhia familiaris_Eurasian Treecreeper -> 0.614, F1-score: 0.638
📊 Erithacus rubecula_European Robin -> 0.595, F1-score: 0.690
📊 Turdus philomelos_Song Thrush -> 0.877, F1-score: 0.512
📊 Loxia curvirostra_Common Crossbill -> 0.050, F1-score: 0.963
📊 Dendrocopos major_Great Spotted Woodpecker -> 0.186, F1-scor

# Test Model

In [121]:
from sklearn.metrics import average_precision_score, precision_recall_fscore_support

def compute_samplewise_mAP(y_true, y_probs):
    """
    Calcola la sample-wise mAP (media delle AP per ogni sample).
    """
    ap_per_sample = []
    for i in range(y_true.shape[0]):
        if np.sum(y_true[i]) == 0:
            continue  # Evita sample senza label positive
        ap = average_precision_score(y_true[i], y_probs[i])
        ap_per_sample.append(ap)
    return np.mean(ap_per_sample)

def compute_classwise_mAP(y_true, y_probs):
    """
    Calcola la class-wise mAP (media delle AP per ogni classe).
    """
    ap_per_class = []
    for i in range(y_true.shape[1]):
        if np.sum(y_true[:, i]) == 0:
            continue  # Evita classi mai presenti
        ap = average_precision_score(y_true[:, i], y_probs[:, i])
        ap_per_class.append(ap)
    return np.mean(ap_per_class)

def compute_f05(y_true, y_pred):
    _, _, f05, _ = precision_recall_fscore_support(
        y_true, y_pred, beta=0.5, average='macro', zero_division=0
    )
    return f05

In [122]:
inverse_mappings = {value: key for key, value in mappings.items()}

In [123]:
import numpy as np
from sklearn.metrics import classification_report
import csv
import os

def test_model(model, dataset_config, batch_size=100, thresholds=0.5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n🧬 Advanced testing on: {device}")
    test_pred_segments = {}

    test_loader = utils.get_dataloader(dataset_config, split="final_test", batch_size=batch_size, shuffle=False)
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    class_names = list(dataset_config['mappings'].keys())
    total_loss = 0.0
    all_preds = []
    all_probs = []
    all_labels = []

    use_custom_threshold = isinstance(thresholds, dict)

    with torch.no_grad():
        for mel_spec, labels, file_path in test_loader:

            # basename = os.path.splitext(file_path[0].split("/")[-1])[0]
            # date, time, segm1, segm2 = basename.split("_")
            # audio_name = "_".join([date, time]) + ".WAV"
            # segm = "_".join([segm1, segm2])
            # test_pred_segments.setdefault(audio_name, {})
            # test_pred_segments[audio_name].setdefault(segm, {})

            mel_spec = mel_spec.to(device)
            labels = labels.to(device)

            outputs = model(mel_spec)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(outputs)

            if use_custom_threshold:
                batch_preds = torch.zeros_like(probs)
                for i, class_name in enumerate(class_names):
                    thresh = thresholds.get(class_name, 0.5)
                    batch_preds[:, i] = (probs[:, i] > thresh).float()
            else:
                batch_preds = (probs > thresholds).float()

            # correct_probs = probs * batch_preds
            # conf_scores = {
            #     inverse_mappings[i]: correct_probs[0, i].item()
            #     for i in range(correct_probs.size(1))
            #     if correct_probs[0, i].item() != 0
            # }
            # test_pred_segments[audio_name][segm].update(conf_scores)

            all_probs.append(probs.cpu())
            all_preds.append(batch_preds.cpu())
            all_labels.append(labels.cpu())

    avg_loss = total_loss / len(test_loader)
    all_probs = torch.cat(all_probs).numpy()
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    samplewise_map = compute_samplewise_mAP(all_labels, all_probs)  # chiamata mAP
    classwise_map = compute_classwise_mAP(all_labels, all_probs)    # chiamata cmAP
    f05_score = compute_f05(all_labels, all_preds)

    with open(f"models/{MODEL_NAME}/metrics_output.csv", mode="w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Metric", "Value"])
        writer.writerow(["mAP (sample-wise)", samplewise_map])
        writer.writerow(["cmAP (class-wise)", classwise_map])
        writer.writerow(["F0.5 Score", f05_score])

    # 👇 Report
    clf_report = classification_report(all_labels, all_preds, target_names=class_names, output_dict=True, zero_division=0)
    return avg_loss, clf_report, samplewise_map, classwise_map, f05_score, test_pred_segments

In [124]:
a = [sample for sample in dataset_config["samples"] if sample["split"] == "final_test"]
a

[{'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_591_0.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_438_0.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_393_0.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_585_0.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_223_5.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_test/Muscicapa striata_Spotted Flycatcher/20190601_090000_594_0.wav',
  'split': 'final_test',
  'labels': [4]},
 {'file_path': '../segments/DATASET_CNN/final_

In [125]:
avg_loss, clf_report, samplewise_map, classwise_map, f05_score, test_pred_segments = test_model(model, dataset_config, thresholds=best_thresholds)


🧬 Advanced testing on: cuda


In [126]:
print("mAP Score: ", samplewise_map)
print("mcAP Score: ", classwise_map)
print("F0.5 Score: ", f05_score)

mAP Score:  0.5222335607593631
mcAP Score:  0.20881649311088749
F0.5 Score:  0.11856008884873939


In [127]:
from io import StringIO
import pandas as pd

clf_report_df = pd.read_json(StringIO(json.dumps(clf_report)), orient='index')
clf_report_df

Unnamed: 0,precision,recall,f1-score,support
Regulus ignicapilla_Common Firecrest,0.441489,0.348739,0.389671,238
Sylvia atricapilla_Eurasian Blackcap,0.191702,0.815416,0.310425,493
Fringilla coelebs_Common Chaffinch,0.654088,0.45283,0.535163,1378
Troglodytes troglodytes_Eurasian Wren,0.112245,0.297297,0.162963,111
Muscicapa striata_Spotted Flycatcher,0.048193,0.023121,0.03125,173
Glaucidium passerinum_Eurasian Pygmy-Owl,0.0,0.0,0.0,0
Pyrrhula pyrrhula_Eurasian Bullfinch,0.0,0.0,0.0,0
Periparus ater_Coal Tit,0.0,0.0,0.0,29
Lophophanes cristatus_Crested Tit,0.0,0.0,0.0,16
Regulus regulus_Goldcrest,0.333333,0.0625,0.105263,16


In [128]:
with open(f'{MODEL_PATH}/classification_report_test.json', 'w') as f:
    json.dump(clf_report, f)