In [None]:
!pip install grad-cam pytest

In [None]:
import os
import numpy as np
import pandas as pd
import torchaudio
import torch
import torch.nn.functional as F
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Number of output classes: width of the logits returned by run_full_inference
# and the range of class indices used in the per-class accuracy report.
NUM_CLASSES = 206
# NOTE(review): presumably the expected waveform length in samples
# (e.g. 10 s at 32 kHz) — TODO confirm; not referenced in the visible cells.
WAV_LEN = 320000
# NOTE(review): presumably (channels, mel bins, frames) of a mel-spectrogram
# input — TODO confirm; not referenced in the visible cells.
MEL_SHAPE = (1, 64, 313)


In [None]:
# CSV listing the test clips; the cells below read its `label` and `audio_path` columns.
TEST_MANIFEST = "/mnt/BirdCLEF/birdclef_dataset/features_sampled/manifest_test.csv"
# CSV with the class taxonomy; its `primary_label` column provides the class names.
TAXONOMY_CSV = "/mnt/BirdCLEF/birdclef_dataset/features_sampled/taxonomy.csv"


In [None]:
# Load the taxonomy table and build the alphabetically sorted list of class names.
tax = pd.read_csv(TAXONOMY_CSV)
# NOTE(review): later cells index CLASSES with the integer `label` from the
# manifest and with range(NUM_CLASSES); this assumes label ids follow this
# sorted order and that len(CLASSES) >= NUM_CLASSES — confirm.
CLASSES = sorted(tax["primary_label"].astype(str).tolist())

In [None]:
# Test-split manifest, one row per clip (columns used below: `label`, `audio_path`).
test_manifest = pd.read_csv(TEST_MANIFEST)

In [None]:
def preprocess_sample(sample):
    """Load one audio clip and return a normalized, fixed-duration mono waveform.

    Args:
        sample: Either a manifest row exposing an ``audio_path`` attribute
            (resolved relative to the denoised feature directory), or a
            ``str`` / ``os.PathLike`` pointing directly at an audio file,
            which is used as given. The second form is what the custom-folder
            evaluation cells pass in.

    Returns:
        torch.Tensor of shape ``(1, sr * 10)``, where ``sr`` is the sample
        rate reported by ``torchaudio.load``: channels averaged to mono,
        zero-padded or truncated to 10 seconds, then standardized to zero
        mean and unit standard deviation.
    """
    if isinstance(sample, (str, os.PathLike)):
        # Direct file path: do not re-root it under the denoised directory.
        wav_path = os.fspath(sample)
    else:
        wav_path = os.path.join("/mnt/BirdCLEF/birdclef_dataset/features_sampled/denoised", sample.audio_path.lstrip(os.sep))
    wav, sr = torchaudio.load(wav_path)
    wav = wav.mean(dim=0)  # collapse channels to mono
    # NOTE(review): the target length depends on each file's sample rate, so
    # clips with different rates yield different lengths — confirm all inputs
    # share one rate (WAV_LEN suggests 32 kHz).
    T = sr * 10
    if wav.size(0) < T:
        wav = F.pad(wav, (0, T - wav.size(0)))
    else:
        wav = wav[:T]
    # clamp_min guards against division by zero on silent (constant) clips.
    wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
    return wav.unsqueeze(0)
def run_full_inference(wav_tensor):
    """Placeholder model: return random logits, one row per input waveform.

    Args:
        wav_tensor: Batched waveform tensor whose first dimension is the
            batch size (e.g. ``(1, T)`` from ``preprocess_sample``). A tensor
            with fewer than two dimensions is treated as a single sample.

    Returns:
        torch.Tensor of shape ``(batch, NUM_CLASSES)`` with random values.
    """
    # Placeholder: replace with real model inference logic
    # Return random logits for demo purpose
    # The batch dimension must follow the input so batched callers get one
    # prediction per waveform instead of a single row.
    batch_size = wav_tensor.shape[0] if getattr(wav_tensor, "ndim", 0) > 1 else 1
    logits = torch.randn(batch_size, NUM_CLASSES)
    return logits

In [None]:
# Running totals for the overall and per-class accuracy reported in the next cell.
total_correct = 0
total_samples = 0
class_hits = defaultdict(int)    # true label -> number of correct predictions
class_counts = defaultdict(int)  # true label -> number of evaluated samples

for idx, row in test_manifest.iterrows():
    try:
        # Label parsing and audio loading sit inside the try so that a single
        # malformed row or unreadable file is reported and skipped instead of
        # aborting the whole evaluation.
        true_label = int(row["label"])
        wav_tensor = preprocess_sample(row)
        with torch.no_grad():  # evaluation only; no autograd graph needed
            logits = run_full_inference(wav_tensor)
        pred_label = int(torch.argmax(logits))
    except Exception as e:
        print(f"Error at index {idx}: {e}")
        continue

    # Only samples that made it through inference are counted.
    class_counts[true_label] += 1
    if pred_label == true_label:
        total_correct += 1
        class_hits[true_label] += 1
    total_samples += 1

In [None]:
# Overall accuracy; NaN (rather than ZeroDivisionError) when nothing was evaluated.
overall_acc = total_correct / total_samples if total_samples else float("nan")
print(f"\n Overall Accuracy: {overall_acc:.4f} ({total_correct}/{total_samples})")

# Accuracy for every class index; classes without test samples are stored as 0.0.
per_class_acc = {
    CLASSES[i]: class_hits[i] / class_counts[i] if class_counts[i] > 0 else 0.0
    for i in range(NUM_CLASSES)
}

# Rank only classes that actually appeared in the test set. Otherwise unseen
# classes (stored as 0.0 above) would fill the "least accurate" list even
# though the model was never tested on them.
evaluated_acc = {
    CLASSES[i]: per_class_acc[CLASSES[i]]
    for i in range(NUM_CLASSES)
    if class_counts[i] > 0
}
least_accurate = sorted(evaluated_acc.items(), key=lambda x: x[1])[:20]

print("\n Top 20 Least Accurate Classes:")
for cls, acc in least_accurate:
    print(f"{cls:<30} Acc: {acc:.4f}")

In [None]:
def model_wrapper(wav_batch):
    """Map a batch of waveforms to class probabilities.

    Args:
        wav_batch: Sequence of equally sized numpy arrays, one per waveform.

    Returns:
        numpy array holding the softmax (over dim 1) of the logits returned
        by ``run_full_inference`` for the stacked batch.
    """
    # Stack the individual waveforms into one float32 batch tensor.
    stacked = np.stack(wav_batch)
    batch = torch.tensor(stacked, dtype=torch.float32)
    with torch.no_grad():
        class_probs = torch.softmax(run_full_inference(batch), dim=1)
        as_numpy = class_probs.cpu().numpy()
    return as_numpy

# `shap` was used in this cell without ever being imported (NameError on a fresh kernel).
import shap

# Prepare background and test samples
# Clamp the hard-coded indices so a manifest shorter than 43 rows does not raise IndexError.
n_background = min(5, len(test_manifest))
test_index = min(42, len(test_manifest) - 1)

# SHAP expects the background data as one 2-D array (samples x features), not a list.
# NOTE(review): stacking requires every waveform to have the same length — confirm
# all clips share one sample rate.
background_samples = np.stack(
    [preprocess_sample(test_manifest.iloc[i]).squeeze(0).numpy() for i in range(n_background)]
)
test_sample = preprocess_sample(test_manifest.iloc[test_index]).squeeze(0).numpy()
test_batch = test_sample[np.newaxis, :]  # shape (1, n_features)

explainer = shap.Explainer(model_wrapper, background_samples)
# NOTE(review): every raw audio sample is a separate feature here, so this call
# is likely to exceed SHAP's default evaluation budget and be very slow —
# confirm, and consider explaining a lower-dimensional representation instead.
shap_values = explainer(test_batch)

# Plot SHAP values
# The model has one output per class; a waterfall plot shows a single output,
# so explain the class the model actually predicts for this sample.
predicted_class = int(np.argmax(model_wrapper(test_batch)[0]))
shap.plots.waterfall(shap_values[0, :, predicted_class])

In [None]:

# Root directory holding one sub-folder of .wav files per custom test group.
CUSTOM_TEST_ROOT = "/mnt/BirdCLEF/custom_templates"
def evaluate_with_ground_truth(folder_name):
    """Run inference on every .wav file in a folder, using the folder name as the label.

    Args:
        folder_name: Sub-folder of ``CUSTOM_TEST_ROOT``; must also be an entry
            of ``CLASSES`` because it is used as the ground-truth class.

    Returns:
        ``(y_true, y_pred)``: two equally long lists of class indices, one
        entry per successfully processed file. Both are empty when the folder
        is missing, the name is not a known class, or no file could be read.
    """
    y_true, y_pred = [], []

    # Resolve the ground-truth index once. Doing it per file (after appending
    # the prediction) could leave y_pred longer than y_true.
    if folder_name not in CLASSES:
        print(f"Error: '{folder_name}' is not a known class label")
        return y_true, y_pred
    true_label = CLASSES.index(folder_name)

    folder_path = os.path.join(CUSTOM_TEST_ROOT, folder_name)
    if not os.path.isdir(folder_path):
        print(f"Error: folder not found: {folder_path}")
        return y_true, y_pred

    # Sorted for a reproducible processing order; extension match is case-insensitive.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith(".wav"): continue
        path = os.path.join(folder_path, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred_label = int(torch.argmax(logits))
        except Exception as e:
            print(f"Error on {fname}: {e}")
            continue
        # Append both lists together so they can never get out of step.
        y_pred.append(pred_label)
        y_true.append(true_label)  # use folder name as ground truth

    return y_true, y_pred

# Report accuracy and a classification report for each custom template folder.
print("\n📈 Accuracy per template folder:\n")
for group in ["insects", "mammalia", "amphibia"]:
    true, pred = evaluate_with_ground_truth(group)
    if true:
        acc = accuracy_score(true, pred)
        print(f"{group.capitalize():<10} → Accuracy: {acc:.4f} ({len(true)} samples)")
        # Every file in a folder shares one true label. Restrict the report to
        # that label: without `labels`, any misprediction adds extra classes
        # and sklearn raises because they no longer match `target_names`.
        # zero_division=0 avoids warnings when the class is never predicted.
        print(classification_report(true, pred, labels=[true[0]], target_names=[group], zero_division=0))
    else:
        print(f"{group.capitalize():<10} → No valid samples")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# === Cell 9: Confusion Matrix for One Group ===
def plot_confusion_for_group(group):
    """Plot a confusion matrix for the .wav files of one custom test folder.

    Args:
        group: Sub-folder of ``CUSTOM_TEST_ROOT``; also used as the true class
            name, so it must be an entry of ``CLASSES``.

    Returns:
        None. Shows a heatmap over the true class and every class that was
        predicted, or prints a message when there is nothing to plot.
    """
    folder = os.path.join(CUSTOM_TEST_ROOT, group)
    if group not in CLASSES:
        print(f"Error: '{group}' is not a known class label")
        return
    if not os.path.isdir(folder):
        print(f"Error: folder not found: {folder}")
        return
    true_label = CLASSES.index(group)  # assumes folder name is the true label

    y_true, y_pred = [], []
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".wav"): continue
        path = os.path.join(folder, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred = int(torch.argmax(logits))
        except Exception as e:
            # Report instead of silently skipping, so systematic failures are visible.
            print(f"Error on {fname}: {e}")
            continue
        y_pred.append(pred)
        y_true.append(true_label)

    if y_true:
        # Include every predicted class; restricting the matrix to the true
        # label alone would discard all misclassifications.
        labels = sorted(set(y_true) | set(y_pred))
        names = [CLASSES[i] for i in labels]
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        # Grow the figure with the number of classes so annotations stay readable.
        plt.figure(figsize=(max(6, 0.5 * len(labels)), max(4, 0.4 * len(labels))))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=names, yticklabels=names)
        plt.title(f"Confusion Matrix for '{group}'")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.show()
    else:
        print(f"No valid samples in {group}/")

# Run for a specific group
plot_confusion_for_group("insects")


In [None]:
# === Cell 10: Evaluate on Folder with Amphibia-like Sounds ===
def evaluate_confused_folder(folder_name, expected_class):
    """Evaluate a folder of look-alike sounds against one expected class.

    Args:
        folder_name: Sub-folder of ``CUSTOM_TEST_ROOT`` containing .wav files.
        expected_class: Entry of ``CLASSES`` used as ground truth for every file.

    Returns:
        None. Prints the accuracy and the ten most frequently predicted
        classes, and shows them as a bar plot.
    """
    # Resolve the ground-truth index once. Doing it per file (after appending
    # the prediction) could leave y_pred longer than y_true.
    if expected_class not in CLASSES:
        print(f"Error: '{expected_class}' is not a known class label")
        return
    expected_label = CLASSES.index(expected_class)

    folder = os.path.join(CUSTOM_TEST_ROOT, folder_name)
    if not os.path.isdir(folder):
        print(f"Error: folder not found: {folder}")
        return

    y_true, y_pred = [], []
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".wav"): continue
        path = os.path.join(folder, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred = int(torch.argmax(logits))
        except Exception as e:
            print(f"Error on {fname}: {e}")
            continue
        y_pred.append(pred)
        y_true.append(expected_label)

    if y_true:
        acc = accuracy_score(y_true, y_pred)
        print(f"\n Accuracy on '{folder_name}' (expected='{expected_class}'): {acc:.4f} ({len(y_true)} samples)")

        # Show confusion distribution
        # (counts all predictions, including ones equal to the expected class)
        pred_labels = [CLASSES[p] for p in y_pred]
        error_counts = pd.Series(pred_labels).value_counts().head(10)
        print("\n Top 10 Predicted Classes (Confusion Candidates):")
        for lbl, count in error_counts.items():
            print(f"{lbl:<30} {count} predictions")

        # Optional barplot
        plt.figure(figsize=(10, 4))
        sns.barplot(x=error_counts.values, y=error_counts.index)
        plt.title(f"Most Common Misclassifications for '{folder_name}'")
        plt.xlabel("Prediction Count")
        plt.ylabel("Predicted Label")
        plt.tight_layout()
        plt.show()
    else:
        print(f"No valid audio samples in '{folder_name}'")

# Run on a confusion-prone folder
evaluate_confused_folder("sound_similar_to_amphibia", expected_class="amphibia")


In [None]:
# Run the test suite under tests/ with verbose names and tracebacks suppressed.
!pytest --verbose --tb=no tests/

In [None]:
# Re-run only the tests that failed in the previous pytest run (--lf = last failed).
!pytest --verbose --lf --tb=no tests/