In [1]:
import os
import torch
import pandas as pd

In [2]:
HF_DATASET_DIR = "/workspaces/ASE-Model-Retrieval/data/imagenet/.cache/hf_datasets"
MODELS_CSV_PATH = "/workspaces/ASE-Model-Retrieval/models/model-list.csv"
META_FEATURES_DIR = (
    "/workspaces/ASE-Model-Retrieval/meta-embedding/.cache/task_embeddings"
)
EVALUATION_RESULTS_DIR = "/workspaces/ASE-Model-Retrieval/data/evaluation_results"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
N_ESTIMATORS = 100
NUM_CLASSES = 93

In [3]:
def load_meta_features(path):
    all_features = sorted([f for f in os.listdir(path) if f.endswith(".pt")])
    print(f"Found {len(all_features)} meta-feature files.")

    meta_features = torch.stack(
        [
            torch.load(os.path.join(path, feature), map_location=DEVICE).flatten()
            for feature in all_features
        ],
        dim=0,
    )
    return meta_features


meta_features = load_meta_features(META_FEATURES_DIR)
meta_features.shape

Found 312 meta-feature files.


torch.Size([312, 4608])

In [4]:
def load_evaluation_data(path):
    all_results = sorted([f for f in os.listdir(path) if f.endswith(".csv")])
    print(f"Found {len(all_results)} meta-feature files.")

    all_results = [
        pd.read_csv(os.path.join(path, result), delimiter=",") for result in all_results
    ]

    return all_results


evaluation_data = load_evaluation_data(EVALUATION_RESULTS_DIR)
evaluation_data

Found 312 meta-feature files.


[                                             model_id  \
 0                timm/mobilenetv3_small_100.lamb_in1k   
 1                               timm/resnet50.a1_in1k   
 2                           timm/inception_v3.tv_in1k   
 3   timm/swinv2_large_window12to16_192to256.ms_in2...   
 4                        timm/efficientnet_b0.ra_in1k   
 ..                                                ...   
 88                    timm/focalnet_small_lrf.ms_in1k   
 89                         timm/darknetaa53.c2ns_in1k   
 90           timm/tiny_vit_21m_384.dist_in22k_ft_in1k   
 91                          timm/regnetz_c16.ra3_in1k   
 92                       timm/mobileone_s4.apple_in1k   
 
                                               dataset  Top-1 Accuracy  \
 0   subclass_Afghan_hound-Egyptian-Lhasa-groenenda...           0.604   
 1   subclass_Afghan_hound-Egyptian-Lhasa-groenenda...           0.764   
 2   subclass_Afghan_hound-Egyptian-Lhasa-groenenda...           0.728   
 3   s

In [5]:
evaluation_labels = torch.tensor([
    torch.tensor(df["Top-1 Accuracy"].values).argmax().item()
    for df in evaluation_data
]) 

evaluation_labels.shape

torch.Size([312])

In [6]:
real_accuracy_matrix = torch.stack([
    torch.tensor(df["Top-1 Accuracy"].values).float()
    for df in evaluation_data
], dim=1)
real_accuracy_matrix = real_accuracy_matrix.T
real_accuracy_matrix.shape

torch.Size([312, 93])

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [8]:
model = BaggingClassifier(
    estimator=SVC(probability=True),
    max_samples=1.0,
    max_features=1.0,  
    n_estimators=N_ESTIMATORS,
    bootstrap=False,
    bootstrap_features=True,
    n_jobs=-1,
)
model

In [9]:
pipe_lr = make_pipeline(StandardScaler(), model)
pipe_lr

In [10]:
class Model:
    def __init__(self, model_id, pred_perf, real_perf):
        self.model_id = model_id
        self.pred_perf = pred_perf
        self.real_perf = real_perf

In [24]:
def dcg_at_k(relevance: torch.Tensor, k: int) -> float:
    """Compute DCG using raw relevance values (assumes higher = better)"""
    rel_k = relevance[:k]
    denom = torch.log2(torch.arange(2, k + 2).float())
    return torch.sum(rel_k / denom)



def NDCG(model_list, k=5, sample_size=1):
    ndcg_scores = []

    for _ in range(sample_size):
        # Get scores
        predicted_scores = torch.tensor([m.pred_perf for m in model_list])
        true_scores = torch.tensor([m.real_perf for m in model_list])

        # Ideal relevance order (ground truth sorted descending)
        _, ideal_indices = torch.sort(true_scores, descending=True)
        ideal_relevance = true_scores[ideal_indices]

        # Predicted relevance order (based on SDS scores)
        _, predicted_indices = torch.sort(predicted_scores, descending=True)
        predicted_relevance = true_scores[predicted_indices]

        # Normalize scores (0 to 1) before computing DCG
        max_val = true_scores.max()
        if max_val > 0:
            predicted_relevance = predicted_relevance / max_val
            ideal_relevance = ideal_relevance / max_val

        dcg = dcg_at_k(predicted_relevance, k)
        idcg = dcg_at_k(ideal_relevance, k)

        ndcg = (dcg / idcg).item() if idcg > 0 else 0.0
        ndcg_scores.append(ndcg)

    return ndcg_scores

def MRR(model_list, sample_size=1):
    mrr = []

    for _ in range(sample_size):
        SDS_scores = torch.tensor([m.pred_perf for m in model_list])
        real_scores = torch.tensor([m.real_perf for m in model_list])

        _, predicted_indices = torch.sort(SDS_scores, descending=True)
        _, ideal_indices = torch.sort(real_scores, descending=True)

        # Find the rank of the best real model in the predicted list
        top_real_model = ideal_indices[0].item()
        rank = (predicted_indices == top_real_model).nonzero(as_tuple=True)[
            0
        ].item() + 1
        mrr.append(1.0 / rank)

    return mrr


def MAP(model_list, k=3, sample_size=1):
    ap_scores = []

    for _ in range(sample_size):
        SDS_scores = torch.tensor([m.pred_perf for m in model_list])
        real_scores = torch.tensor([m.real_perf for m in model_list])

        _, predicted_indices = torch.sort(SDS_scores, descending=True)
        _, ideal_indices = torch.sort(real_scores, descending=True)

        predicted_topk = predicted_indices[:k]
        ideal_topk = set(ideal_indices[:k].tolist())

        hits = 0
        precision_sum = 0.0

        for n, idx in enumerate(predicted_topk):
            if idx.item() in ideal_topk:
                hits += 1
                precision_sum += hits / (n + 1)

        ap = precision_sum / len(ideal_topk) if hits > 0 else 0.0
        ap_scores.append(ap)

    return ap_scores

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X = meta_features.cpu().numpy()
Y = evaluation_labels.cpu().numpy()
X_train, X_test, y_train, y_test, idx_train, idx_test = None, None, None, None, None, None,
for _ in range(100):
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, Y, range(len(meta_features)), test_size=0.2
    )

# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
pipe_lr.fit(X_train, y_train)
prediction_proba = pipe_lr.predict_proba(X_test)
prediction_proba

array([[0.30388727, 0.00766693, 0.0153981 , ..., 0.0182836 , 0.05880346,
        0.00556569],
       [0.31442537, 0.00776948, 0.01861366, ..., 0.02000571, 0.0729367 ,
        0.00695087],
       [0.31441018, 0.0069152 , 0.01544389, ..., 0.01288722, 0.06467124,
        0.00780659],
       ...,
       [0.27920351, 0.00539938, 0.01537724, ..., 0.01632969, 0.07675068,
        0.00420905],
       [0.31437191, 0.0063739 , 0.01845937, ..., 0.01354769, 0.0655178 ,
        0.01200281],
       [0.30393202, 0.00579887, 0.01644595, ..., 0.01530357, 0.06507521,
        0.00959259]], shape=(63, 22))

In [15]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

In [16]:
def train_custom_bagging_ensemble(X_train, y_train, n_estimators=N_ESTIMATORS):

    estimators = []
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    for seed in range(N_ESTIMATORS):
        np.random.seed(seed)
        indices = np.random.choice(len(X_train), len(X_train), replace=True)
        X_sub = X_train_scaled[indices]
        y_sub = y_train[indices]

        clf = SVC(probability=True)
        clf.fit(X_sub, y_sub)
        estimators.append(clf)
    
    return estimators, scaler

In [17]:
estimators, scaler = train_custom_bagging_ensemble(X_train, y_train)
print(estimators)
print(scaler)

[SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probability=True), SVC(probab

In [18]:
def predict_proba_bagging(estimators, scaler, X_test):
    X_test_scaled = scaler.transform(X_test)
    all_probs = []
    
    for clf in estimators:
        proba = clf.predict_proba(X_test_scaled)  # shape: [n_samples, num_classes_seen]
        
        # Map to full class set
        probs_full = np.zeros((X_test.shape[0], NUM_CLASSES))
        for i, cls in enumerate(clf.classes_):
            probs_full[:, cls] = proba[:, i]
        
        all_probs.append(probs_full)
    
    # Average across all estimators
    avg_probs = np.mean(all_probs, axis=0)  # shape: [n_samples, 93]
    return avg_probs

In [19]:
prediction_proba = predict_proba_bagging(estimators, scaler, X_test)
prediction_proba

array([[0.        , 0.        , 0.        , ..., 0.03191875, 0.00618126,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09409027, 0.00672574,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05862413, 0.005344  ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.10232638, 0.00799922,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05981461, 0.00524195,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05327512, 0.00557742,
        0.        ]], shape=(63, 93))

In [30]:
ndcg5_t, mrr_t, map_t = [], [], []
acc_list = []
for i, task_index in enumerate(idx_test):
    model_list = []

    for model_id in range(NUM_CLASSES):
        pred_perf = float(prediction_proba[i][model_id])
        # print(f"Pred perf: {pred_perf}")
        real_perf = real_accuracy_matrix[task_index][model_id].item()
        # print(f"Real perf: {real_perf}")

        model = Model(model_id=str(model_id), pred_perf=pred_perf, real_perf=real_perf)
        model_list.append(model)

    ndcg5_t.extend(NDCG(model_list, k=5, sample_size=1))
    mrr_t.extend(MRR(model_list, sample_size=1))
    map_t.extend(MAP(model_list, k=3, sample_size=1))

acc = pipe_lr.score(X_test, y_test)
acc_list.append(acc)

In [28]:
task_idx = 0
sds_scores = prediction_proba[task_idx]
real_scores = real_accuracy_matrix[idx_test[task_idx]]

top_pred = torch.topk(torch.tensor(sds_scores), 5).indices
top_real = torch.topk(real_scores, 5).indices

print("Top predicted models:", top_pred.tolist())
print("Top real models:", top_real.tolist())

Top predicted models: [3, 72, 65, 34, 90]
Top real models: [72, 15, 3, 34, 62]


In [31]:
print("acc: ", sum(acc_list) / len(acc_list))
print("ndcg5: ", sum(ndcg5_t) / len(ndcg5_t))
print("mrr: ", sum(mrr_t) / len(mrr_t))
print("map: ", sum(map_t) / len(map_t))

acc:  0.31746031746031744
ndcg5:  0.9924523915563311
mrr:  0.5373795904103376
map:  0.41710758377425067


In [34]:
print("len(acc_list):", len(acc_list))
print("len(ndcg5_t):", len(ndcg5_t))
print("len(mrr_t):", len(mrr_t))
print("len(map_t):", len(map_t))

len(acc_list): 1
len(ndcg5_t): 63
len(mrr_t): 63
len(map_t): 63


In [32]:
import pickle

with open("acc_list.pkl", "wb") as f:
    pickle.dump(acc_list, f)

with open("ndcg5_list.pkl", "wb") as f:
    pickle.dump(ndcg5_t, f)

with open("mrr_list.pkl", "wb") as f:
    pickle.dump(mrr_t, f)

with open("map_list.pkl", "wb") as f:
    pickle.dump(map_t, f)

In [35]:
# Save per-task ranking metrics
results_df = pd.DataFrame({
    "NDCG@5": ndcg5_t,
    "MRR": mrr_t,
    "MAP@3": map_t
})
results_df.to_csv("sds_metrics_per_task.csv", index=False)

# Save overall accuracy separately
overall_df = pd.DataFrame({"Accuracy": [acc_list[0]]})
overall_df.to_csv("overall_accuracy.csv", index=False)

In [38]:
import collections
counts = collections.Counter(Y)
print("Label distribution:")
for model_id in range(93):
    print(f"Model {model_id}: {counts[model_id]} times")

Label distribution:
Model 0: 0 times
Model 1: 0 times
Model 2: 0 times
Model 3: 97 times
Model 4: 0 times
Model 5: 0 times
Model 6: 0 times
Model 7: 0 times
Model 8: 1 times
Model 9: 1 times
Model 10: 0 times
Model 11: 0 times
Model 12: 5 times
Model 13: 0 times
Model 14: 0 times
Model 15: 7 times
Model 16: 0 times
Model 17: 0 times
Model 18: 0 times
Model 19: 1 times
Model 20: 0 times
Model 21: 0 times
Model 22: 2 times
Model 23: 0 times
Model 24: 12 times
Model 25: 0 times
Model 26: 0 times
Model 27: 0 times
Model 28: 0 times
Model 29: 0 times
Model 30: 0 times
Model 31: 3 times
Model 32: 14 times
Model 33: 6 times
Model 34: 13 times
Model 35: 0 times
Model 36: 0 times
Model 37: 1 times
Model 38: 0 times
Model 39: 0 times
Model 40: 1 times
Model 41: 0 times
Model 42: 0 times
Model 43: 0 times
Model 44: 1 times
Model 45: 0 times
Model 46: 0 times
Model 47: 0 times
Model 48: 0 times
Model 49: 0 times
Model 50: 0 times
Model 51: 0 times
Model 52: 0 times
Model 53: 0 times
Model 54: 0 ti

In [39]:
top_models = []
for df in evaluation_data:
    top_model = df["Top-1 Accuracy"].idxmax()
    top_models.append(top_model)

model_counts = collections.Counter(top_models)
for model_id, count in model_counts.most_common():
    print(f"Model {model_id}: {count} times top-1")

Model 3: 97 times top-1
Model 72: 69 times top-1
Model 65: 33 times top-1
Model 90: 21 times top-1
Model 32: 14 times top-1
Model 34: 13 times top-1
Model 24: 12 times top-1
Model 62: 8 times top-1
Model 15: 7 times top-1
Model 33: 6 times top-1
Model 88: 6 times top-1
Model 12: 5 times top-1
Model 69: 3 times top-1
Model 31: 3 times top-1
Model 87: 3 times top-1
Model 91: 2 times top-1
Model 22: 2 times top-1
Model 82: 2 times top-1
Model 19: 1 times top-1
Model 40: 1 times top-1
Model 37: 1 times top-1
Model 8: 1 times top-1
Model 44: 1 times top-1
Model 9: 1 times top-1


In [40]:
import scipy.stats

entropies = []
for probs in prediction_proba:
    entropies.append(scipy.stats.entropy(probs))

print(f"Mean entropy of predicted rankings: {np.mean(entropies):.4f}")

Mean entropy of predicted rankings: 2.0780


In [41]:
from scipy.spatial.distance import cosine
similarities = []

for i in range(len(prediction_proba)):
    for j in range(i+1, len(prediction_proba)):
        sim = 1 - cosine(prediction_proba[i], prediction_proba[j])
        similarities.append(sim)

print(f"Mean predicted ranking similarity across tasks: {np.mean(similarities):.4f}")

Mean predicted ranking similarity across tasks: 0.9330


In [42]:
top1_preds = [np.argmax(p) for p in prediction_proba]
top3_sets = [tuple(np.argsort(p)[-3:][::-1]) for p in prediction_proba]

print(f"Unique top-1 predictions: {len(set(top1_preds))}")
print(f"Unique top-3 predictions: {len(set(top3_sets))}")

Unique top-1 predictions: 2
Unique top-3 predictions: 7
