# Classification report comparison with final tests

In [4]:
import json
import pandas as pd
import numpy as np

In [5]:
def load_report(path):
    with open(path, 'r') as f:
        return json.load(f)

In [30]:
var = "wabad"
report1 = load_report(f'/home/giacomoschiavo/finetuning-BirdNET/models/DeeperCNN/classification_report_{var}.json')
report2 = load_report(f'/home/giacomoschiavo/finetuning-BirdNET/models/finetuned/{var}/classification_report.json')

# 1. Metrics Table

In [31]:
print(f'''- Dataset {var} 
      Precision: CNN = {report1["samples avg"]["precision"]:.4f}, BirdNET FT = {report2["samples avg"]["precision"]:.4f}
      Recall: CNN = {report1["samples avg"]["recall"]:.4f}, BirdNET FT = {report2["samples avg"]["recall"]:.4f}
      F1: CNN = {report1["samples avg"]["f1-score"]:.4f}, BirdNET FT = {report2["samples avg"]["f1-score"]:.4f}''')

- Dataset wabad 
      Precision: CNN = 0.5663, BirdNET FT = 0.5863
      Recall: CNN = 0.5930, BirdNET FT = 0.6739
      F1: CNN = 0.5698, BirdNET FT = 0.6055


In [32]:
print(f'''- Dataset {var} 
      Precision: CNN = {report1["macro avg"]["precision"]:.4f}, BirdNET FT = {report2["macro avg"]["precision"]:.4f}
      Recall: CNN = {report1["macro avg"]["recall"]:.4f}, BirdNET FT = {report2["macro avg"]["recall"]:.4f}
      F1: CNN = {report1["macro avg"]["f1-score"]:.4f}, BirdNET FT = {report2["macro avg"]["f1-score"]:.4f}''')

- Dataset wabad 
      Precision: CNN = 0.2586, BirdNET FT = 0.4050
      Recall: CNN = 0.2567, BirdNET FT = 0.2771
      F1: CNN = 0.2366, BirdNET FT = 0.2609


In [110]:
metrics = ['accuracy', 'recall', 'f1-score']
data = {
    'FineTuned': [report2['macro avg']['precision'], report2['macro avg']['recall'], report2['macro avg']['f1-score']],
    'VanillaCNN': [report1['macro avg']['precision'], report1['macro avg']['recall'], report1['macro avg']['f1-score']],
}

df_comparison = pd.DataFrame(data, index=metrics)
df_comparison

Unnamed: 0,FineTuned,VanillaCNN
accuracy,0.659532,0.352183
recall,0.155476,0.207238
f1-score,0.211653,0.182876


# 2. Classification Reports Comparison

In [111]:
def compare_classification_reports(report1, report2):
    classes = set(report1.keys()).intersection(report2.keys())
    metrics = ['precision', 'recall', 'f1-score']
    
    diffs = {
        cls: {
            metric: report1[cls][metric] - report2[cls][metric]
            for metric in metrics
        }
        for cls in classes
    }

    df = pd.DataFrame.from_dict(diffs, orient='index')
    df_sorted = df.sort_values(by='f1-score', ascending=False)

    print("\n📈 Class-wise F1-score differences:")
    print(df_sorted)

    # Macro averages
    macro_avg1 = np.mean([[v[m] for m in metrics] for v in report1.values()], axis=0)
    macro_avg2 = np.mean([[v[m] for m in metrics] for v in report2.values()], axis=0)
    macro_delta = dict(zip(metrics, (macro_avg2 - macro_avg1)))

    print("\n📊 Macro average changes:")
    for metric in metrics:
        delta = macro_delta[metric]
        status = "⬆️ improved" if delta > 0 else "⬇️ worse"
        print(f"{metric.capitalize()}: {delta:.4f} ({status})")

    return df_sorted

In [112]:
# how is modelA with respect to modelB? modelA - modelB (piu positivi e meglio per A)
comparison_df = compare_classification_reports(report1, report2)


📈 Class-wise F1-score differences:
                                            precision    recall  f1-score
Fringilla coelebs_Common Chaffinch          -0.175946  0.617619  0.557606
Periparus ater_Coal Tit                      0.833333  0.357143  0.500000
Sylvia atricapilla_Eurasian Blackcap        -0.658615  0.415822  0.352610
Erithacus rubecula_European Robin           -0.118902  0.336331  0.343816
Dryocopus martius_Black Woodpecker          -0.200000  0.095238  0.133779
Turdus merula_Eurasian Blackbird            -0.142857  0.057143  0.102559
Troglodytes troglodytes_Eurasian Wren        0.000000  0.045045  0.084707
Certhia familiaris_Eurasian Treecreeper      0.036145  1.000000  0.069767
Turdus philomelos_Song Thrush                0.029126  0.103448  0.045455
micro avg                                   -0.511260  0.036296  0.012750
Regulus regulus_Goldcrest                    0.000000  0.000000  0.000000
Muscicapa striata_Spotted Flycatcher         0.000000  0.000000  0.000000
sa

# Classification report visualization

In [113]:
df_report1 = pd.DataFrame.from_dict(report1, orient='index')
print(modelA)
df_report1.sort_index()

augm


Unnamed: 0,precision,recall,f1-score,support
Aeroplane,0.0,0.0,0.0,22
Certhia familiaris_Eurasian Treecreeper,0.036145,1.0,0.069767,3
Dendrocopos major_Great Spotted Woodpecker,0.0,0.0,0.0,25
Dryocopus martius_Black Woodpecker,0.8,0.190476,0.307692,21
Erithacus rubecula_European Robin,0.881098,0.519784,0.653846,556
Fringilla coelebs_Common Chaffinch,0.763184,0.718838,0.740347,1067
Lophophanes cristatus_Crested Tit,0.216216,0.571429,0.313725,14
Loxia curvirostra_Common Crossbill,0.5,0.019231,0.037037,52
Muscicapa striata_Spotted Flycatcher,0.0,0.0,0.0,108
,0.0,0.0,0.0,4907


In [114]:
df_report2 = pd.DataFrame.from_dict(report2, orient='index')
df_report2.sort_index()

Unnamed: 0,precision,recall,f1-score,support
Certhia familiaris_Eurasian Treecreeper,0.0,0.0,0.0,3
Dendrocopos major_Great Spotted Woodpecker,1.0,0.92,0.958333,25
Dryocopus martius_Black Woodpecker,1.0,0.095238,0.173913,21
Erithacus rubecula_European Robin,1.0,0.183453,0.31003,556
Fringilla coelebs_Common Chaffinch,0.93913,0.101218,0.182741,1067
Lophophanes cristatus_Crested Tit,1.0,0.357143,0.526316,14
Muscicapa striata_Spotted Flycatcher,0.0,0.0,0.0,108
Periparus ater_Coal Tit,0.0,0.0,0.0,28
Phylloscopus collybita_Common Chiffchaff,0.973856,0.221068,0.360339,674
Regulus ignicapilla_Common Firecrest,0.98,0.411765,0.579882,238
