In [6]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
# Mode 2 is the setting used throughout this notebook for live reloading.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import os

# Record the directory the kernel started in before switching.
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Move into the project checkout under the current user's home directory so
# that relative paths used later resolve against it.
project_dir = os.path.expanduser("~/clef/animalclef-2025")
os.chdir(project_dir)
print("Changed Working Directory to:", os.getcwd())

Current Working Directory: /storage/home/hcoda1/2/cmarutha3/clef/animalclef-2025
Changed Working Directory to: /storage/home/hcoda1/2/cmarutha3/clef/animalclef-2025


In [8]:
from animalclef.spark import get_spark
from pyspark.sql import functions as F, Window
from pyspark.sql import SparkSession
from pathlib import Path
import numpy as np
from animalclef.dataset import split_reid_data, summarize_split
import pandas as pd
import matplotlib.pyplot as plt

### Import the projection-head checkpoint

In [9]:
import torch

# Checkpoint file to inspect. Despite the variable name `embeddings`, the
# recorded output shows the loaded object is an OrderedDict mapping parameter
# names (e.g. "projection.0.weight") to tensors, i.e. a state dict.
file_name = "megadescriptor_256_non_linear_head.pt"

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# location.
embeddings = torch.load(
    f"{Path.home()}/shared/animalclef/data/embeddings/triplets/megadescriptor/{file_name}",
    # The recorded output shows the tensors on cuda:0. Keep the default device
    # handling when a GPU is available, and remap to CPU otherwise so the load
    # does not fail on a CPU-only machine.
    map_location=None if torch.cuda.is_available() else "cpu",
)
display(embeddings)

OrderedDict([('projection.0.weight',
              tensor([[-0.0318,  0.0772,  0.0336,  ...,  0.0066, -0.0196,  0.0159],
                      [ 0.0166,  0.0840, -0.0193,  ...,  0.0880, -0.0845, -0.0230],
                      [-0.0386,  0.0057,  0.0082,  ..., -0.0226,  0.0102,  0.0007],
                      ...,
                      [ 0.0195,  0.0543, -0.0202,  ...,  0.0096,  0.0394,  0.0360],
                      [ 0.0170,  0.0328, -0.0354,  ..., -0.0230,  0.0125,  0.0115],
                      [ 0.0319, -0.0070, -0.0207,  ...,  0.0130,  0.0110, -0.0078]],
                     device='cuda:0')),
             ('projection.0.bias',
              tensor([-0.1010, -0.0913, -0.1084,  ..., -0.0876, -0.0703, -0.0909],
                     device='cuda:0')),
             ('projection.2.weight',
              tensor([[-3.4179e-02, -5.0550e-02, -6.2399e-02,  ..., -3.3577e-02,
                       -4.1272e-02,  4.0639e-03],
                      [ 3.8983e-03,  5.7221e-02,  6.4722e-02,  ..

In [None]:
def evaluate_model(
    model: DinoClassifier,
    test_features: Dict[str, np.ndarray],
    test_labels: np.ndarray,
    class_names: List[str] = None,
    config: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """
    Evaluate a trained classifier model on test data.

    The model is run over the test set in batches with gradients disabled; the
    prediction for each sample is the argmax over ``outputs["logits"]``.
    Accuracy, a classification report and a confusion matrix are computed from
    the collected predictions.

    Side effects: creates ``config["save_dir"]`` if needed, writes
    ``confusion_matrix.png`` into it, and logs the accuracy and the text
    classification report through ``logger``.

    Args:
        model: Trained DinoClassifier model
        test_features: Dict with 'cls' and 'avg_patch' features for testing
        test_labels: Labels for test data
        class_names: Optional list of class names for visualization. When
            given, ``class_names[i]`` is assumed to name class index ``i``
            (the indices produced by the argmax over the logits), and the
            report / confusion matrix cover every named class even if some
            classes do not occur in the test data.
        config: Configuration dictionary. Recognised keys are ``batch_size``,
            ``device`` and ``save_dir``; any key given here overrides the
            built-in default.

    Returns:
        Dictionary with evaluation results: ``accuracy``,
        ``classification_report`` (dict form), ``confusion_matrix``,
        ``predictions`` and ``labels``.
    """
    defaults = {
        "batch_size": 64,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "save_dir": "experiments/baseline/evaluation",
    }
    # Caller-supplied values win over the defaults.
    config = {**defaults, **(config or {})}
    device = config["device"]

    # Create test dataset and loader (no shuffling, so predictions keep the
    # order of the input labels).
    test_dataset = DinoFeatureDataset(test_features, test_labels)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config["batch_size"], shuffle=False
    )

    # Set model to evaluation mode
    model = model.to(device)
    model.eval()

    # Collect predictions
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for features, labels in test_loader:
            # Each batch of features is a dict of tensors; move every entry
            # to the evaluation device.
            features = {k: v.to(device) for k, v in features.items()}
            outputs = model(features)
            _, preds = torch.max(outputs["logits"], 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Explicit None/length check: plain truthiness is ambiguous for numpy
    # arrays, and an empty list carries no names to show.
    has_class_names = class_names is not None and len(class_names) > 0
    target_names = class_names if has_class_names else None
    # Pin the label set to all named classes. Without this, a class missing
    # from the test data makes classification_report reject target_names
    # (length mismatch) and shrinks the confusion matrix so that it no longer
    # lines up with the tick labels.
    report_labels = np.arange(len(class_names)) if has_class_names else None

    # Compute metrics
    accuracy = np.mean(all_preds == all_labels)
    report = classification_report(
        all_labels,
        all_preds,
        labels=report_labels,
        target_names=target_names,
        output_dict=True,
    )
    report_text = classification_report(
        all_labels, all_preds, labels=report_labels, target_names=target_names
    )
    cm = confusion_matrix(all_labels, all_preds, labels=report_labels)

    # Save results
    os.makedirs(config["save_dir"], exist_ok=True)

    # Save confusion matrix plot
    tick_labels = class_names if has_class_names else "auto"
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=False,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.savefig(os.path.join(config["save_dir"], "confusion_matrix.png"))
    # Close the figure so repeated evaluations do not accumulate open figures.
    plt.close()

    # Log results
    logger.info(f"Test Accuracy: {accuracy:.4f}")
    logger.info(f"Classification Report:\n{report_text}")

    return {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": cm,
        "predictions": all_preds,
        "labels": all_labels,
    }

In [None]:
import faiss
from animalclef.metrics import BAKS, BAUS

# Stack the per-row "cls" vectors into 2-D matrices. FAISS operates on
# C-contiguous float32 arrays, so convert explicitly instead of relying on
# whatever dtype the stored vectors happen to have.
X_train = np.ascontiguousarray(np.stack(train_df.cls.values), dtype=np.float32)
X_val = np.ascontiguousarray(np.stack(val_df.cls.values), dtype=np.float32)
X_test = np.ascontiguousarray(np.stack(test_df.cls.values), dtype=np.float32)

# Create a FAISS index for efficient nearest neighbor search
index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

# Perform a search for the validation set
# use the nearest neighbor for now for voting
k = 1
# Distances and indices for validation set
dist_val, idx_val = index.search(X_val, k)
display(dist_val)

# Do the actual prediction: each validation row takes the identity of its
# closest training row. Column 0 holds the nearest neighbor, so this stays
# correct (one prediction per validation row) even if k is raised later.
predictions_val = train_df.iloc[idx_val[:, 0]]["identity"].values

# identities in val not in train
identity_val_only = sorted(
    set(val_df.identity.unique()) - set(train_df.identity.unique())
)

# Score the validation predictions (the test set is not scored here).
# NOTE(review): predictions are copied straight from train_df, so unless
# "unknown" is itself a training identity it is never predicted here —
# confirm that the BAUS value is meaningful without a distance threshold.
display(
    BAKS(val_df["identity"].values, predictions_val, identity_val_only),
    BAUS(val_df["identity"].values, predictions_val, identity_val_only, "unknown"),
)

### Prediction

In [None]:
# Attach the predicted identities to the query rows (this writes the column
# onto `unknown_df` itself).
unknown_df["identity"] = predictions

# Destination of the submission CSV under the user's home directory.
output_path = Path.home().joinpath(
    "shared/animalclef/data/predictions/20250516-megadescriptor-l-384-triplets.csv"
)
# Make sure the target folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Only the image id and its predicted identity go into the file.
submission_columns = ["image_id", "identity"]
unknown_df[submission_columns].to_csv(output_path, header=True, index=False)

In [None]:
# Submit the CSV at `output_path` to the animal-clef-2025 Kaggle competition
# through the Kaggle CLI; `{output_path}` is interpolated from the Python variable.
# NOTE(review): the submission message says "baseline salamanders only" while
# the output file name refers to megadescriptor triplets — confirm the message
# still describes this submission.
! kaggle competitions submit -c animal-clef-2025 -f {output_path} -m "baseline salamanders only"